## Loading required package: digest
## Loading required package: tibble
## Loading required package: ggplot2
## Project name: 01.protein-seq-evo-v1
## Loading project configuration
## Autoloading packages
## Loading package: dplyr
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading package: stringr
## Loading required package: stringr
## Loading package: readr
## Loading required package: readr
## Loading package: ggplot2
## Loading package: tidyr
## Loading required package: tidyr
## Loading package: patchwork
## Loading required package: patchwork
## Loading package: gridExtra
## Loading required package: gridExtra
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
## Loading package: GGally
## Loading required package: GGally
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
## Loading package: readxl
## Loading required package: readxl
## Loading package: viridis
## Loading required package: viridis
## Loading required package: viridisLite
## Loading package: cowplot
## Loading required package: cowplot
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:patchwork':
##
## align_plots
## Loading package: ggsignif
## Loading required package: ggsignif
## Loading package: minpack.lm
## Loading required package: minpack.lm
## Loading package: purrr
## Loading required package: purrr
## Loading package: scales
## Loading required package: scales
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:viridis':
##
## viridis_pal
## The following object is masked from 'package:readr':
##
## col_factor
## Loading package: bigsnpr
## Loading required package: bigsnpr
## Loading required package: bigstatsr
## Loading package: drc
## Loading required package: drc
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:patchwork':
##
## area
## The following object is masked from 'package:dplyr':
##
## select
##
## 'drc' has been loaded.
## Please cite R and 'drc' if used for a publication,
## for references type 'citation()' and 'citation('drc')'.
##
## Attaching package: 'drc'
## The following objects are masked from 'package:stats':
##
## gaussian, getInitial
## Loading package: data.table
## Loading required package: data.table
##
## Attaching package: 'data.table'
## The following object is masked from 'package:purrr':
##
## transpose
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## Autoloading helper functions
## Running helper script: globals.R
## Running helper script: helpers.R
## Autoloading data
## Munging data
## Running preprocessing script: 01-util.R
## Sourcing R script: 01-util.R
## Running preprocessing script: 02-munge_kras.R
## Sourcing R script: 02-munge_kras.R
## Rows: 189 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): SEQ, ATOM, COLOR, CONFIDENCE INTERVAL, MSA DATA, RESIDUE VARIETY
## dbl (2): POS, SCORE
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 3591 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): X1, X2, X4
## dbl (1): X3
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 3780 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): variant
## dbl (2): score, pos
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 3572 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): mutant
## dbl (11): column_coverage, popEVE, pop-adjusted_ESM1v, pop-adjusted_EVH_epis...
## lgl (3): pop-adjusted_EVE, pop-adjusted_Tranception, EVE
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 3780 Columns: 43
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (18): wt_aa, mt_aa, ClinVar_ClinicalSignificance, Starred_Coarse_Grained...
## dbl (11): position, Gold_Stars, NumberSubmitters, frequency_gv2, frequency_g...
## lgl (14): BS1, PM2, PM5, PP5, BP6, b_model, p_model, b_acmg_model, lb_acmg_m...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 2587 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): id, wt_aa, mt_aa, bp_interface, binding_RAF, binding_RAL, binding_...
## dbl (20): Pos_real, abundance_ddg, abundance_ddg_std, pik3cg_ddg, pik3cg_ddg...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 3453 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): id, wt_codon
## dbl (4): Pos_real, mean_kcal/mol, std_kcal/mol, ESM1v
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Running preprocessing script: 03-munge_src.R
##
## Sourcing R script: 03-munge_src.R
##
## Rows: 5112 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: " "
## chr (1): id
## dbl (8): FL_activity_mean_kcal/mol, FL_activity_std_kcal/mol, FL_folding_mea...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 536 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): SEQ, ATOM, COLOR, CONFIDENCE INTERVAL, MSA DATA, RESIDUE VARIETY
## dbl (2): POS, SCORE
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 10184 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): X1, X2, X4
## dbl (1): X3
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 10720 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): variant
## dbl (2): score, pos
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 10184 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): mutant
## dbl (14): column_coverage, popEVE, pop-adjusted_EVE, pop-adjusted_ESM1v, pop...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 10184 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): mutant
## dbl (14): column_coverage, popEVE, pop-adjusted_EVE, pop-adjusted_ESM1v, pop...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 536 Columns: 27
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (15): WT_AA, domains_Pfam, KD_lobe, secondary_structure_uniprot, seconda...
## dbl (1): position
## lgl (11): block1, block2, block3, block4, block5, R-spine, C-spine, Communit...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4898 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): id, wt_aa
## dbl (9): FL_kinase_fitness_scaled, FL_kinase_sigma_scaled, FL_abundance_fitn...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Running preprocessing script: 04-munge_psd95.R
##
## Sourcing R script: 04-munge_psd95.R
##
## Rows: 3154 Columns: 41
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (14): id_eve, id_old, trait_name, library, assay, pdz_name, alignment_po...
## dbl (26): X, V1, pos_am, ddg, std_ddg, ci95_kcal.mol, pdz, structural_alignm...
## lgl (1): binding_interface_5A
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Running preprocessing script: 05-munge-grb2.R
##
## Sourcing R script: 05-munge-grb2.R
##
## Rows: 1056 Columns: 43
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (15): id, old_id, id_ref, SS, Pos_class, protein, WT_AA, Mut, wt_aa.x, m...
## dbl (23): Pos_real, Pos_ref, Pos, mut_order, f_dg_pred, f_ddg_pred, f_ddg_pr...
## lgl (5): f_ddg_pred_conf, b_ddg_pred_conf, allosteric, orthosteric, alloste...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1689 Columns: 27
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (12): id, protein, pca_type, aa_seq, old_id, wt_aa.x, mt_aa, wt_aa.y, at...
## dbl (14): Pos_real, Nham_aa, fitness, sigma, growthrate, growthrate_sigma, c...
## lgl (1): WT
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
gck_abundance <- read.csv("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/vampseq/vampseq_ddg/proteinGym/HXK4_HUMAN_Gersing_2023_abundance.csv", header = TRUE)
gck_activity <- read.csv("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/vampseq/vampseq_ddg/proteinGym/HXK4_HUMAN_Gersing_2022_activity.csv", header = TRUE)
gck_esm <- read.csv("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/vampseq/vampseq_esm1v/P35557.csv")
nrow(gck_abundance) #8396
## [1] 8396
nrow(gck_activity) #8570
## [1] 8570
nrow(gck_esm) #8835
## [1] 8835
gck_df <- merge(gck_abundance, gck_esm, by.x = "mutant", by.y = "variant")
gck_df <- gck_df %>% dplyr::select (mutant, DMS_score, DMS_score_bin, ESM.1v) %>%
dplyr::rename(DMS_score_abundance = DMS_score,
DMS_score_bin_abundance = DMS_score_bin)
gck_df <- merge(gck_df, gck_activity, by = "mutant")
gck_df <- gck_df %>% dplyr::rename(DMS_score_activity = DMS_score,
DMS_score_bin_activity = DMS_score_bin) %>%
dplyr::select (mutant, DMS_score_abundance, DMS_score_bin_abundance, ESM.1v,
DMS_score_activity, DMS_score_bin_activity)
gck_df <- gck_df %>%
mutate(mutation_position = as.numeric(str_extract(mutant, "(?<=\\D)(\\d+)(?=\\D)")))
gck_clinvar <- read.csv("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/vampseq/vampseq_clinvar/P35557_clinvar_GCK.csv")
nrow(gck_clinvar) #8835
## [1] 8835
gck_df <- merge(gck_df, gck_clinvar, by.x="mutant", by.y="variant")
gck_clinvar_web <- read.delim("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/vampseq/vampseq_clinvar/GCK_clinvar_web_042925.txt")
nrow(gck_clinvar_web) #319
## [1] 319
table(gck_clinvar_web$Variant.type)
##
## single nucleotide variant
## 319
# single nucleotide variant
# 319
# Map from 3-letter to 1-letter amino acid codes
aa_map <- c(
Ala="A", Arg="R", Asn="N", Asp="D", Cys="C",
Gln="Q", Glu="E", Gly="G", His="H", Ile="I",
Leu="L", Lys="K", Met="M", Phe="F", Pro="P",
Ser="S", Thr="T", Trp="W", Tyr="Y", Val="V",
Ter="*"
)
# Extract long-form protein change (e.g., Ala456Val)
gck_clinvar_web$canonical_long <- sub(".*\\(p\\.([A-Za-z]+\\d+[A-Za-z]+)\\).*", "\\1", gck_clinvar_web$Name)
# Convert long form to short form using aa_map
gck_clinvar_web$canonical_short <- sapply(gck_clinvar_web$canonical_long, function(change) {
if (grepl("^([A-Za-z]{3})(\\d+)([A-Za-z]{3})$", change)) {
parts <- regmatches(change, regexec("^([A-Za-z]{3})(\\d+)([A-Za-z]{3})$", change))[[1]]
from <- aa_map[[parts[2]]]
to <- aa_map[[parts[4]]]
pos <- parts[3]
if (!is.null(from) && !is.null(to)) {
return(paste0(from, pos, to))
}
}
return(NA)
})
gck_clinvar_web <- gck_clinvar_web %>% dplyr::select( canonical_short, Condition.s., Germline.classification,Protein.change)
nrow(gck_clinvar_web) #319
## [1] 319
# Classify based on known pathogenic GCK disease mechanisms
classify_gck_conditions <- function(cond_str) {
cond_lower <- tolower(cond_str)
# Individual flags
is_mody <- grepl("maturity[- ]onset diabetes.*young", cond_lower)
is_pndm <- grepl("neonatal diabetes", cond_lower)
is_hh <- grepl("hyperinsulinism", cond_lower)
is_mono <- grepl("monogenic diabetes", cond_lower)
is_np <- grepl("not provided", cond_lower)
# Label assignment based on specific rules
labels <- c()
if (is_mody) labels <- c(labels, "MODY")
if (is_pndm) labels <- c(labels, "PNDM")
if (is_hh) labels <- c(labels, "HH")
if (is_mono) labels <- c(labels, "Monogenic diabetes")
if (is_np && length(labels) == 0) labels <- c("Not provided") # only assign if nothing else detected
if (length(labels) == 0) {
return("Other")
} else if (length(labels) == 1) {
return(labels)
} else {
return("Mixed")
}
}
# Apply the classification
gck_clinvar_web$clean_condition <- sapply(gck_clinvar_web$Condition.s., classify_gck_conditions)
# View cleaned summary
#table(gck_clinvar_web$clean_condition)
#HH Mixed MODY Monogenic diabetes Not provided Other
#3 14 69 189 41 3
gck_clinvar_web <- gck_clinvar_web %>%
mutate(canonical_short = if_else(
is.na(canonical_short),
Protein.change,
canonical_short
))
nrow(gck_clinvar_web) #319 3+14+69+189 = 275
## [1] 319
length(unique(gck_clinvar_web$canonical_short)) #313
## [1] 313
# Find duplicated canonical_short values
dup_idx <- duplicated(gck_clinvar_web$canonical_short) | duplicated(gck_clinvar_web$canonical_short, fromLast = TRUE)
# Extract rows with duplicated canonical_short values
gck_clinvar_dups <- gck_clinvar_web[dup_idx, ]
gck_clinvar_web <- gck_clinvar_web %>% distinct(canonical_short, .keep_all = TRUE)
nrow(gck_clinvar_web) #313
## [1] 313
gck_df_merged <- merge(gck_df, gck_clinvar_web, by.x="mutant", by.y="canonical_short", all.x = TRUE)
nrow(gck_df_merged) #8255
## [1] 8255
range(gck_df_merged$DMS_score_abundance) #-0.9834964 1.6096087
## [1] -0.9834964 1.6096087
range(gck_df_merged$DMS_score_activity) #-1.085214 6.670528
## [1] -1.085214 6.670528
gck_gnomad <- read.csv("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/vampseq/vampseq_gnomad/GCK_gnomAD_v4.1.0_ENSG00000106633_2025_04_09_14_05_39.csv")
gck_gnomad <- gck_gnomad %>% filter(VEP.Annotation == "missense_variant")
summary(gck_gnomad$Allele.Frequency)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.195e-07 6.197e-07 6.230e-07 1.099e-05 1.859e-06 2.580e-03
nrow(gck_gnomad) #528
## [1] 528
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 6.195e-07 6.197e-07 6.230e-07 1.076e-05 1.859e-06 2.580e-03
gck_gnomad <- gck_gnomad %>% dplyr::select(HGVS.Consequence, ClinVar.Germline.Classification,
rsIDs, ClinVar.Variation.ID)
nrow(gck_gnomad) #528
## [1] 528
# Function to convert e.g. "p.Glu2Gln" → "E2Q"
convert_to_short <- function(consequence) {
consequence <- gsub("^p\\.", "", consequence) # Remove 'p.'
matches <- regmatches(consequence, regexec("([A-Za-z]{3})([0-9]+)([A-Za-z]{3})", consequence))[[1]]
if (length(matches) == 4) {
from <- aa_map[[matches[2]]]
pos <- matches[3]
to <- aa_map[[matches[4]]]
return(paste0(from, pos, to))
} else {
return(NA)
}
}
# Apply to the column
gck_gnomad$ID <- sapply(gck_gnomad$HGVS.Consequence, convert_to_short)
nrow(gck_gnomad) #528
## [1] 528
length(unique(gck_gnomad$ID)) #519
## [1] 519
gck_gnomad <- gck_gnomad %>% distinct(ID, .keep_all = TRUE)
nrow(gck_gnomad) #519
## [1] 519
nrow(gck_df_merged) #8255
## [1] 8255
gck_df <- merge(gck_df_merged, gck_gnomad, by.x="mutant", by.y = "ID", all.x = TRUE)
nrow(gck_df) #8255
## [1] 8255
length(unique(gck_df$mutant)) #8255
## [1] 8255
#https://cspec.genome.network/cspec/ui/svi/doc/GN086
active_positions <- c(151:179, # disordered loop
151-153, 168-169, 204-206, 225-231, 254-258, 287, 290, # glucose-binding
78:85, 151, 169, 205, 225:229, 295:296, 331:333, 336, 410:416 # ATP-binding
)
fil_gck_df <- gck_df %>%
filter(!mutation_position %in% active_positions)
nrow(fil_gck_df) #7224
## [1] 7224
# Fit a loess model using the filtered data
loess_fit <- loess(DMS_score_activity ~ DMS_score_abundance, data = fil_gck_df, span = 0.7, family = "symmetric")
# Predict fitted values for ALL data points using the loess model trained on fil_gck_df
gck_df$fitted <- predict(loess_fit, newdata = gck_df)
# Calculate residuals for ALL points
gck_df$residuals <- gck_df$DMS_score_activity - gck_df$fitted
range(gck_df$residuals) #-1.891487 6.102024
## [1] -1.891487 6.102024
# Generate LOESS fit line from fil_gck_df
loess_fit <- loess(DMS_score_activity ~ DMS_score_abundance, data = fil_gck_df, span = 0.7, family = "symmetric")
fit_line_df <- data.frame(
DMS_score_abundance = seq(-0.6,
max(fil_gck_df$DMS_score_abundance, na.rm = TRUE),
length.out = 200)
)
fit_line_df$DMS_score_activity <- predict(loess_fit, newdata = fit_line_df)
#length(unique(gck_df$mutant)) #8255
range(gck_df$residuals) #-1.891487 6.102024
## [1] -1.891487 6.102024
range(gck_df$DMS_score_abundance) #-0.9834964 1.6096087
## [1] -0.9834964 1.6096087
range(gck_df$DMS_score_activity) #-1.085214 6.670528
## [1] -1.085214 6.670528
median(gck_df$DMS_score_activity) #0.56
## [1] 0.5603297
gck_df <- gck_df %>%
mutate(mutation_position = as.numeric(str_extract(mutant, "(?<=\\D)(\\d+)(?=\\D)")))
nrow(gck_df)
## [1] 8255
median_residuals <- gck_df %>%
dplyr::group_by(mutation_position) %>%
summarise(median_residuals = median(residuals, na.rm = TRUE))
min(median_residuals$median_residuals) #-1.136173
## [1] -1.136173
max(median_residuals$median_residuals) #2.551111
## [1] 2.551111
pdb <- read.pdb("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/residual_pdb/GCK/1v4s.pdb")
data <- median_residuals
# Create a new B-factor vector initialized with the current B-factors from the PDB
new_b_factors <- pdb$atom$b
# Loop through each position in the correlation data and update the B-factors
for (i in 1:nrow(data)) {
position <- data$mutation_position[i]
correlation_value <- data$median_residuals[i]
# Find indices in the PDB that match the current position
indices <- which(pdb$atom$resno == position)
# Print the indices and current B-factors before updating
#cat("Updating residue number:", position, "\n")
#cat("Indices in PDB:", indices, "\n")
#cat("Current B-factors:", new_b_factors[indices], "\n")
# Update B-factors for all atoms in the current residue
new_b_factors[indices] <- correlation_value
# Print the new B-factors after updating
#cat("Updated B-factors:", new_b_factors[indices], "\n")
#cat("\n") # Add an extra line for readability
}
# Replace non-matching B-factors with outlier value so we can filter it out in ChimeraX
non_matching_indices <- setdiff(seq_along(new_b_factors), which(pdb$atom$resno %in% data$mutation_position))
new_b_factors[non_matching_indices] <- 999
# Assign the new B-factors back to the pdb structure
# Write the modified PDB structure to a new file
pdb$atom$b <- new_b_factors
write.pdb(pdb, file = "/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/data/residual_pdb/GCK/1v4s_median_residual_loess.pdb")
# Plot
nrow(gck_df)
## [1] 8255
p1 <- ggplot(gck_df, aes(x = DMS_score_abundance, y = DMS_score_activity, color = residuals)) +
geom_point(size = 2, alpha = 0.35) +
geom_vline(xintercept = 0.58, linetype = "dashed", linewidth = 0.5, color = "grey") +
geom_hline(yintercept = 0.66, linetype = "dashed", linewidth = 0.5, color = "grey") +
geom_line(data = fit_line_df, aes(x = DMS_score_abundance, y = DMS_score_activity),
inherit.aes = FALSE, color = "black", linewidth = 1) +
geom_text_repel(data = subset(gck_df, mutant %in% c("Y214E")),
aes(label = mutant),
size = 4,color = "gold2",
max.overlaps = Inf, box.padding = 0.4, point.padding = 0.3
) +
geom_text_repel(data = subset(gck_df, mutant %in% c("T206M", "V452L")),
aes(label = mutant),
size = 4,color = "black",
max.overlaps = Inf, box.padding = 0.4, point.padding = 0.3
) +
geom_point(data = subset(gck_df, mutant %in% c("T206M", "V452L")),
aes(x = DMS_score_abundance, y = DMS_score_activity),
shape = 17, color = "black", size = 3) + # Triangle shape
scale_color_viridis(option = "C", direction = 1, limits = c(-6.2, 6.2)) +
labs(
title = "All 8255 Variants",
x = "Measured abundance",
y = "Measured activity",
color = "LOESS residuals"
) +
scale_y_continuous(breaks = seq(-1, 7, by = 2)) + xlim(-1,1.7) +
theme_classic() + theme(legend.position = "none") +theme(
panel.background = element_rect(fill = "white", color = NA),
plot.background = element_rect(fill = "white", color = NA),
legend.background = element_rect(fill = "white", color = NA)
)
p1 <- ggMarginal(
p1,
type = "density",
margins = "both",
groupColour = FALSE,
groupFill = FALSE,
size = 10,
colour = "grey",
fill = "lightgrey"
)
patho_df <- gck_df %>% filter(clean_condition %in% c("HH", "MODY", "Monogenic diabetes"))
nrow(patho_df) #224
## [1] 224
p2 <- ggplot(patho_df, aes(x = DMS_score_abundance, y = DMS_score_activity, color = residuals)) +
geom_point(size = 2, shape = 17) +
geom_text_repel(data = subset(gck_df, mutant %in% c("T206M", "V452L")),
aes(label = mutant),
size = 4,color = "black",
max.overlaps = Inf, box.padding = 0.4, point.padding = 0.3
) +
geom_point(data = subset(gck_df, mutant %in% c("T206M", "V452L")),
aes(x = DMS_score_abundance, y = DMS_score_activity),
shape = 17, color = "black", size = 3) + # Triangle shape
geom_vline(xintercept = 0.58, linetype = "dashed", linewidth = 0.5, color = "grey") +
geom_hline(yintercept = 0.66, linetype = "dashed", linewidth = 0.5, color = "grey") +
geom_line(data = fit_line_df, aes(x = DMS_score_abundance, y = DMS_score_activity),
inherit.aes = FALSE, color = "black", linewidth = 1) +
scale_color_viridis(option = "C", direction = 1, limits = c(-6.2, 6.2)) +
labs(
title = "ClinVar Pathogenic Variants",
x = "Measured abundance",
y = "Measured activity",
color = "Loess residuals"
) +scale_y_continuous(
breaks = seq(-1, 7, by = 2),
limits = c(-1.1, 7)
) +
theme_classic() + xlim(-1,1.7) +
theme(legend.position = "none")
p2 <- ggMarginal(
p2,
type = "density",
margins = "both",
groupColour = FALSE,
groupFill = FALSE,
size = 10,
colour = "grey",
fill = "lightgrey"
)
gnomad_df <- gck_df %>% filter(!is.na(HGVS.Consequence)) %>%
filter(!clinvar_clinical_significance %in% c("likely_pathogenic", "pathogenic", "conflict", "likely_risk_allele", "uncertain_risk_allele", "VUS")) %>%
filter(!ClinVar.Germline.Classification %in% c("Pathogenic", "Pathogenic/Likely pathogenic", "Likely pathogenic",
"Conflicting classifications of pathogenicity", "Likely pathogenic/Likely risk allele",
"Likely risk allele", "Pathogenic/Likely pathogenic/Likely risk allele",
"Uncertain significance", "Uncertain significance/Uncertain risk allele")) %>%
filter(is.na(Germline.classification))
#filter(is.na(Phenotype_Class))
#filter(is.na(somatic))
#nrow(gnomad_df) #268
p3 <- ggplot(gnomad_df, aes(x = DMS_score_abundance, y = DMS_score_activity, color = residuals)) +
geom_point(size = 2) +
geom_vline(xintercept = 0.58, linetype = "dashed", linewidth = 0.5, color = "grey") +
geom_hline(yintercept = 0.66, linetype = "dashed", linewidth = 0.5, color = "grey") +
geom_line(data = fit_line_df, aes(x = DMS_score_abundance, y = DMS_score_activity),
inherit.aes = FALSE, color = "black", linewidth = 1) +
geom_text_repel(data = subset(gck_df, mutant %in% c("A11T", "E279Q", "R46K")),
aes(label = mutant),
size = 4,color = "black",
max.overlaps = Inf, box.padding = 0.4, point.padding = 0.3
) +
geom_point(data = subset(gck_df, mutant %in% c("A11T", "E279Q", "R46K")),
aes(x = DMS_score_abundance, y = DMS_score_activity),
color = "black", size = 2) + # Triangle shape
scale_color_viridis(option = "C", direction = 1, limits = c(-6.2, 6.2)) +
labs(
title = "gnomAD Variants",
x = "Measured abundance",
y = "Measured activity",
color = "Loess residuals"
) + theme_classic() + scale_y_continuous(breaks = seq(-1, 7, by = 2),
limits = c(-1, 7)) + xlim(-1,1.7) +theme(legend.position = "none")
p3 <- ggMarginal(
p3,
type = "density",
margins = "both",
groupColour = FALSE,
groupFill = FALSE,
size = 10,
colour = "grey",
fill = "lightgrey"
)
benign_df <- gck_df %>%
filter(clinvar_clinical_significance %in% c("likely_benign", "benign")) %>%
filter(!ClinVar.Germline.Classification %in% c("Pathogenic", "Pathogenic/Likely pathogenic", "Likely pathogenic",
"Conflicting classifications of pathogenicity")) %>%
filter(is.na(Germline.classification))
#nrow(benign_df) #3
p4 <- ggplot(benign_df, aes(x = DMS_score_abundance, y = DMS_score_activity, color = residuals)) +
geom_point(size = 2) +
geom_vline(xintercept = 0.58, linetype = "dashed", linewidth = 0.5, color = "grey") +
geom_hline(yintercept = 0.66, linetype = "dashed", linewidth = 0.5, color = "grey") +
geom_line(data = fit_line_df, aes(x = DMS_score_abundance, y = DMS_score_activity),
inherit.aes = FALSE, color = "black", linewidth = 1) +
scale_color_viridis(option = "C", direction = 1, limits = c(-6.2, 6.2)) +
labs(
title = "ClinVar Benign Variants",
x = "Measured abundance",
y = "Measured activity",
color = "Loess residuals"
) + theme_classic() + scale_y_continuous(breaks = seq(-1, 7, by = 2),
limits = c(-1, 7)) + xlim(-1,1.7) +theme(legend.position = "none")
p4 <- ggMarginal(
p4,
type = "density",
margins = "both",
groupColour = FALSE,
groupFill = FALSE,
size = 10,
colour = "grey",
fill = "lightgrey"
)
p5 <- plot_grid(p1,p3,p2, nrow=1, ncol=3)
ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig5_scatter.pdf",
plot = p5, width = 9, height = 3, dpi = 300)
p5

p1 <- ggplot(benign_df, aes(x = DMS_score_abundance, y = DMS_score_activity, color = residuals)) +
geom_point(size = 2) +
geom_vline(xintercept = 0.58, linetype = "dashed", linewidth = 0.5, color = "grey") +
geom_hline(yintercept = 0.66, linetype = "dashed", linewidth = 0.5, color = "grey") +
geom_line(data = fit_line_df, aes(x = DMS_score_abundance, y = DMS_score_activity),
inherit.aes = FALSE, color = "black", linewidth = 1) +
scale_color_viridis(option = "C", direction = 1, limits = c(-6.2, 6.2)) +
labs(
title = "ClinVar Benign Variants",
x = "Measured abundance",
y = "Measured activity",
color = "Loess residuals"
) + theme_classic() + scale_y_continuous(breaks = seq(-1, 7, by = 2),
limits = c(-1, 7)) + xlim(-1,1.7) +
theme(legend.position = "bottom")
ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig5_scatter_legend.pdf",
plot = p1, width = 3, height = 3, dpi = 300)
p1

# 1. Set up
set.seed(11)
n_boot <- 1000
match_window <- 0.5
n_patho <- nrow(patho_df)
# 2. Get abundance/residuals from gck_patho
patho_df <- patho_df %>% dplyr::select(DMS_score_abundance, residuals)
patho_df$group <- "Pathogenic"
# 3. Bootstrap sampling from non-patho pool with abundance matching
non_patho_pool <- gck_df %>%
filter(!(mutant %in% patho_df$mutant))
bootstrap_medians <- vector("numeric", length = n_boot)
# Pre-group non-patho pool into bins
non_patho_pool <- non_patho_pool %>%
mutate(bin = cut(DMS_score_abundance, breaks = seq(-0.5, 2, by = match_window)))
# Bin pathogenic variants accordingly
patho_df <- patho_df %>%
mutate(bin = cut(DMS_score_abundance, breaks = seq(-0.5, 2, by = match_window)))
# Create lookup table for fast sampling
bin_lookup <- split(non_patho_pool$residuals, non_patho_pool$bin)
# Bootstrap matrix
bootstrap_matrix <- matrix(NA, nrow = n_boot, ncol = n_patho)
for (i in 1:n_boot) {
for (j in 1:n_patho) {
bin_j <- patho_df$bin[j]
candidates <- bin_lookup[[as.character(bin_j)]]
if (!is.null(candidates) && length(candidates) > 0) {
bootstrap_matrix[i, j] <- sample(candidates, 1)
}
}
}
# Summarize into a dataframe
boot_df <- data.frame(
group = "Random abundance-matched",
residuals = apply(bootstrap_matrix, 1, median, na.rm = TRUE)
)
# 7. Combine with pathogenic residuals
plot_df <- bind_rows(
patho_df %>% dplyr::select(group, residuals),
boot_df
)
# 8. Labels for plot
label_df <- plot_df %>%
group_by(group) %>%
summarise(
n = n(),
median_val = median(residuals),
y_max = max(residuals),
.groups = "drop"
) %>%
mutate(n_label = case_when(
group == "Random abundance-matched" ~ "bootstrapped 1000 times",
TRUE ~ paste0("n = ", n)
))
# Plot
p_fast <- ggplot(plot_df, aes(x = group, y = residuals, fill = group)) +
geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
geom_jitter(width = 0.15, size = 2, alpha = 0.7, color = "lightgrey") +
stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
stat_summary(fun = median, geom = "point", shape = 23, size = 2.5,
fill = "black", color = "black", stroke = 0.7) +
geom_text(
data = label_df,
aes(x = group, y = 3.5, label = n_label),
inherit.aes = FALSE,
size = 4) +
geom_text(
data = label_df,
aes(x = group, y = median_val + 0.25, label = sprintf(" %.2f", median_val)),
inherit.aes = FALSE,
size = 6
) +
labs(
x = "",
y = "Activity-abundance residuals",
title = "All Variants"
) +
theme_classic(base_size = 14) +
scale_fill_manual(values = c("Pathogenic" = "cornflowerblue", "Random abundance-matched" = "darkolivegreen3")) +
theme(legend.position = "none") +
geom_signif(comparisons = list(c("Pathogenic", "Random abundance-matched")),
map_signif_level = FALSE,
test = "wilcox.test",
tip_length = 0.01)
p_fast

ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig5_violin1.pdf",
plot = p_fast, width = 3, height = 4, dpi = 300)
p_fast

hist(boot_df$residuals, breaks = 30, main = "Bootstrap medians", xlab = "Residuals")

nrow(gnomad_df) #268
## [1] 268
patho_df <- gck_df %>% filter(!is.na(Germline.classification))
#nrow(patho_df) #277
gnomad_df <- gnomad_df %>% dplyr::select(mutant, DMS_score_abundance, DMS_score_activity, residuals)
patho_df1 <- patho_df %>% filter(clean_condition == "HH") %>% dplyr::select(mutant, DMS_score_abundance, DMS_score_activity, residuals)
patho_df2 <- patho_df %>% filter(clean_condition == "MODY") %>% dplyr::select(mutant, DMS_score_abundance, DMS_score_activity, residuals)
patho_df3 <- patho_df %>% filter(clean_condition == "Monogenic diabetes") %>% dplyr::select(mutant, DMS_score_abundance, DMS_score_activity, residuals)
nrow(patho_df1)
## [1] 3
nrow(patho_df2)
## [1] 57
nrow(patho_df3)
## [1] 164
patho_df1$var_class2 <- "HH"
patho_df2$var_class2 <- "MODY"
patho_df3$var_class2 <- "Monogenic diabetes"
gnomad_df$var_class2 <- "gnomAD"
wilcox.test(patho_df1$residuals, gnomad_df$residuals) #0.004004
##
## Wilcoxon rank sum test with continuity correction
##
## data: patho_df1$residuals and gnomad_df$residuals
## W = 791, p-value = 0.004004
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(patho_df2$residuals, gnomad_df$residuals) #3.87e-07
##
## Wilcoxon rank sum test with continuity correction
##
## data: patho_df2$residuals and gnomad_df$residuals
## W = 4368, p-value = 3.87e-07
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(patho_df3$residuals, gnomad_df$residuals) #< 2.2e-16
##
## Wilcoxon rank sum test with continuity correction
##
## data: patho_df3$residuals and gnomad_df$residuals
## W = 10458, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(patho_df1$residuals, patho_df2$residuals) #0.003939
##
## Wilcoxon rank sum test with continuity correction
##
## data: patho_df1$residuals and patho_df2$residuals
## W = 171, p-value = 0.003939
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(patho_df1$residuals, patho_df3$residuals) #0.003479
##
## Wilcoxon rank sum test with continuity correction
##
## data: patho_df1$residuals and patho_df3$residuals
## W = 489, p-value = 0.003479
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(patho_df2$residuals, patho_df3$residuals) #0.3465
##
## Wilcoxon rank sum test with continuity correction
##
## data: patho_df2$residuals and patho_df3$residuals
## W = 5066, p-value = 0.3465
## alternative hypothesis: true location shift is not equal to 0
combined_df <- rbind(patho_df1,patho_df2,patho_df3,gnomad_df)
median_df <- combined_df %>%
group_by(var_class2) %>%
summarise(
median_residual = median(residuals),
n = n()
)
combined_df$var_class2 <- factor(
combined_df$var_class2,
levels = c("gnomAD", "HH", "MODY", "Monogenic diabetes")
)
custom_colors <- c(
"gnomAD" = "#1b9e77", # Teal green
"HH" = "#f4a6b3", # Warm orange
"MODY" = "#e7298a", # Muted purple
"Monogenic diabetes" = "#7570b3" # Hot pink / magenta
)
p10 <- ggplot(combined_df, aes(x = var_class2, y = residuals, fill = var_class2)) +
geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
geom_jitter(width = 0.15, size = 2, alpha = 0.7, color = "lightgrey") +
stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
stat_summary(fun = median, geom = "point", shape = 23, size = 2, fill = "black", color = "black", stroke = 0.7) +
geom_text(
data = median_df,
aes(x = var_class2, y = median_residual + 0.5, label = sprintf("%.2f", median_residual)),
inherit.aes = FALSE,
size = 5
) +
scale_fill_manual(values = custom_colors) +
geom_text(
data = median_df,
aes(x = var_class2, y = 3, label = paste0("n = ", n)),
inherit.aes = FALSE,
size = 4.5
) +
labs(
title = "Across Groups",
x = "",
y = "Activity-abundance residuals",
fill = ""
) +
theme_classic(base_size = 14) +
theme(
legend.position = "none",
axis.text.x = element_text(angle = 20, hjust = 1)
) +
geom_signif(
comparisons = list(c("gnomAD", "HH"),
c("gnomAD", "MODY"),
c("gnomAD", "Monogenic diabetes")),
map_signif_level = FALSE,
test = "wilcox.test",
step_increase = 0.1,
textsize = 3
)
ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig5_violin2.pdf",
plot = p10, width = 3, height = 4, dpi = 300)
p10

#nrow(gnomad_df) #268
#nrow(patho_df) #277
#length(unique(patho_df$mutant)) #277
#table(patho_df$clean_condition)
#HH Mixed MODY Monogenic diabetes Not provided Other
#3 14 57 164 36 3
patho_df <- patho_df %>% filter(clean_condition %in% c("HH","MODY","Monogenic diabetes") )
gnomad_df$var_source <- "gnomAD"
patho_df$var_source <- "Pathogenic"
gnomad_df <- gnomad_df %>% dplyr::select(mutant, residuals, var_source,DMS_score_abundance)
patho_df <- patho_df %>% dplyr::select(mutant, residuals, var_source,DMS_score_abundance)
combined_df <- rbind(gnomad_df, patho_df)
combined_df <- combined_df %>%
mutate(mutation_position = as.numeric(str_extract(mutant, "(?<=\\D)(\\d+)(?=\\D)")))
combined_df_int <- combined_df %>% filter(mutation_position %in% active_positions)
nrow(combined_df_int) #71
## [1] 71
#length(unique(combined_df_int$mutant)) #71
median_df <- combined_df_int %>%
group_by(var_source) %>%
summarise(
median_residual = median(residuals, na.rm = TRUE),
n = n()
)
#median_df
#table(combined_df_int$var_source)
custom_colors <- c(
"gnomAD" = "#1b9e77", # Teal green (unchanged)
"Pathogenic" = "#de425b" # Deep ocean blue
)
p13 <- ggplot(combined_df_int, aes(x = var_source, y = residuals, fill = var_source)) +
geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
geom_jitter(width = 0.15, size = 2, alpha = 0.7, color = "lightgrey") +
stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
stat_summary(fun = median, geom = "point", shape = 23, size = 2, fill = "black", color = "black", stroke = 0.7) +
geom_text(
data = median_df,
aes(x = var_source, y = median_residual + 1, label = sprintf("%.2f", median_residual)),
inherit.aes = FALSE,
size = 5
) +
scale_fill_manual(values = custom_colors) +
geom_text(
data = median_df,
aes(x = var_source, y = 3, label = paste0("n = ", n)),
inherit.aes = FALSE,
size = 4.5
) +
labs(
title = "71 Orthosteric Variants",
x = "",
y = "Activity_abundance residuals",
fill = ""
) +
theme_classic() +
theme(
legend.position = "none"
) +
geom_signif(
comparisons = list(c("gnomAD", "Pathogenic")),
map_signif_level = FALSE,
test = "wilcox.test",
step_increase = 0.1, y_position = 3.5,
textsize = 3
) + ylim(-5, 5)
p13

combined_df_out <- combined_df %>% filter(!mutation_position %in% active_positions)
nrow(combined_df_out) #421
## [1] 421
median_df <- combined_df_out %>%
group_by(var_source) %>%
summarise(
median_residual = median(residuals, na.rm = TRUE),
n = n()
)
#median_df
p14 <- ggplot(combined_df_out, aes(x = var_source, y = residuals, fill = var_source)) +
geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
geom_jitter(width = 0.15, size = 2, alpha = 0.7, color = "lightgrey") +
stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
stat_summary(fun = median, geom = "point", shape = 23, size = 2, fill = "black", color = "black", stroke = 0.7) +
geom_text(
data = median_df,
aes(x = var_source, y = median_residual + 1, label = sprintf("%.2f", median_residual)),
inherit.aes = FALSE,
size = 5
) +
scale_fill_manual(values = custom_colors) +
geom_text(
data = median_df,
aes(x = var_source, y = 3, label = paste0("n = ", n)),
inherit.aes = FALSE,
size = 4.5
) +
labs(
title = "421 Non-orthosteric Variants",
x = "",
y = "Activity-abundance residuals",
fill = ""
) +
theme_classic() +
theme(
legend.position = "none"
) +
geom_signif(
comparisons = list(c("gnomAD", "Pathogenic")),
map_signif_level = FALSE,
test = "wilcox.test",
step_increase = 0.1, y_position = 3.5,
textsize = 3
) + ylim(-5, 5)
p14

combined_df_stable <- combined_df %>% filter(DMS_score_abundance > 0.58)
nrow(combined_df_stable) #350
## [1] 350
#table(combined_df_stable$var_source)
combined_df_stable_int <- combined_df_stable %>% filter(mutation_position %in% active_positions)
nrow(combined_df_stable_int) #59
## [1] 59
median_df <- combined_df_stable_int %>%
group_by(var_source) %>%
summarise(
median_residual = median(residuals, na.rm = TRUE),
n = n()
)
#median_df
p15 <- ggplot(combined_df_stable_int, aes(x = var_source, y = residuals, fill = var_source)) +
geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
geom_jitter(width = 0.15, size = 2, alpha = 0.7, color = "lightgrey") +
stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
stat_summary(fun = median, geom = "point", shape = 23, size = 2, fill = "black", color = "black", stroke = 0.7) +
geom_text(
data = median_df,
aes(x = var_source, y = median_residual + 1, label = sprintf("%.2f", median_residual)),
inherit.aes = FALSE,
size = 5
) +
scale_fill_manual(values = custom_colors) +
geom_text(
data = median_df,
aes(x = var_source, y = 3, label = paste0("n = ", n)),
inherit.aes = FALSE,
size = 4.5
) +
labs(
title = "59 WT-abundance Orthosteric Variants",
x = "",
y = "Activity-abundance residuals",
fill = ""
) +
theme_classic() +
theme(
legend.position = "none"
) +
geom_signif(
comparisons = list(c("gnomAD", "Pathogenic")),
map_signif_level = FALSE,
test = "wilcox.test",
step_increase = 0.1, y_position = 3.5,
textsize = 3
) + ylim(-5, 5)
combined_df_stable_out <- combined_df_stable %>% filter(!mutation_position %in% active_positions)
nrow(combined_df_stable_out) #291
## [1] 291
median_df <- combined_df_stable_out %>%
group_by(var_source) %>%
summarise(
median_residual = median(residuals, na.rm = TRUE),
n = n()
)
#median_df
p16 <- ggplot(combined_df_stable_out, aes(x = var_source, y = residuals, fill = var_source)) +
geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
geom_jitter(width = 0.15, size = 2, alpha = 0.7, color = "lightgrey") +
stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
stat_summary(fun = median, geom = "point", shape = 23, size = 2, fill = "black", color = "black", stroke = 0.7) +
geom_text(
data = median_df,
aes(x = var_source, y = median_residual + 1, label = sprintf("%.2f", median_residual)),
inherit.aes = FALSE,
size = 5
) +
scale_fill_manual(values = custom_colors) +
geom_text(
data = median_df,
aes(x = var_source, y = 3, label = paste0("n = ", n)),
inherit.aes = FALSE,
size = 4.5
) +
labs(
title = "291 WT-abundance Non-orthosteric Variants",
x = "",
y = "Activity-abundance residuals",
fill = ""
) +
theme_classic() +
theme(
legend.position = "none"
) +
geom_signif(
comparisons = list(c("gnomAD", "Pathogenic")),
map_signif_level = FALSE,
test = "wilcox.test",
step_increase = 0.1, y_position = 3.5,
textsize = 3
) + ylim(-5, 5)
p17 <- plot_grid(p13,p14,p15,p16,ncol=4,nrow=1)
p17

p18 <- plot_grid(p13,p14,ncol=2,nrow=1)
p19 <- plot_grid(p15,p16,ncol=2,nrow=1)
ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig5_violin3.pdf",
plot = p18, width = 5, height = 3, dpi = 300)
ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig5_violin4.pdf",
plot = p19, width = 5, height = 3, dpi = 300)
p18

p19

pdb <- read.pdb("~/Documents/0.Projects/01.protein-seq-evo-v1/data/residual_pdb/GCK/1v4s.pdb")
#unique(pdb$atom$resid)
# Extract C-alpha atoms from protein (exclude ligand)
protein_ca <- pdb$atom[pdb$atom$elety == "CA" & pdb$atom$resid != "GLC" & pdb$atom$resid != "MRK" & pdb$atom$resid != "HOH", ]
# Extract ligand atoms (TLA residue)
ligand_atoms <- pdb$atom[pdb$atom$resid == "GLC" & pdb$atom$type == "HETATM", ]
# Calculate min distance from each C-alpha to the ligand
protein_ca$min_dist_to_ligand <- apply(protein_ca, 1, function(atom) {
dists <- sqrt((as.numeric(atom[["x"]]) - as.numeric(ligand_atoms$x))^2 +
(as.numeric(atom[["y"]]) - as.numeric(ligand_atoms$y))^2 +
(as.numeric(atom[["z"]]) - as.numeric(ligand_atoms$z))^2)
return(min(dists))
})
# View summary
#nrow(protein_ca) #448
#summary(protein_ca$min_dist_to_ligand)
#head(protein_ca$min_dist_to_ligand)
#length(protein_ca$min_dist_to_ligand) #448
#nrow(gck_df_merged)
fil_protein_ca <- protein_ca %>% dplyr::select(resid, resno, min_dist_to_ligand)
active_positions <- c(151:179, # disordered loop
151-153, 168-169, 204-206, 225-231, 254-258, 287, 290, # glucose-binding
78:85, 151, 169, 205, 225:229, 295:296, 331:333, 336, 410:416 # ATP-binding
)
active_sites <- c(151:179, # disordered loop
78:85, 151, 169, 205, 225:229, 295:296, 331:333, 336, 410:416) # ATP-binding
binding_sites <- c(151:153, 168:169, 204:206, 225:231, 254:258, 287, 290) # glucose-binding)
merged_df <- merge(gck_df, fil_protein_ca, by.x="mutation_position", by.y = "resno", all.x = TRUE)
#nrow(merged_df) #8255
merged_df <- merged_df %>% dplyr::select(-sequence)
nrow(merged_df)
## [1] 8255
merged_df <- merged_df[!is.na(merged_df$min_dist_to_ligand),]
nrow(merged_df) #7969
## [1] 7969
gnomad_df <- merged_df %>% filter(mutant %in% gnomad_df$mutant)
nrow(gnomad_df)
## [1] 251
patho_df1 <- merged_df %>% filter(mutant %in% patho_df1$mutant)
nrow(patho_df1)
## [1] 3
patho_df2 <- merged_df %>% filter(mutant %in% patho_df2$mutant)
nrow(patho_df2)
## [1] 57
patho_df3 <- merged_df %>% filter(mutant %in% patho_df3$mutant)
nrow(patho_df3)
## [1] 164
gnomad_df <- gnomad_df %>% dplyr::select(mutant, DMS_score_abundance, DMS_score_activity, residuals, min_dist_to_ligand)
patho_df1 <- patho_df1 %>% dplyr::select(mutant, DMS_score_abundance, DMS_score_activity, residuals, min_dist_to_ligand)
patho_df2 <- patho_df2 %>% dplyr::select(mutant, DMS_score_abundance, DMS_score_activity, residuals, min_dist_to_ligand)
patho_df3 <-patho_df3 %>% dplyr::select(mutant, DMS_score_abundance, DMS_score_activity, residuals, min_dist_to_ligand)
patho_df1$var_class2 <- "HH"
patho_df2$var_class2 <- "MODY"
patho_df3$var_class2 <- "Monogenic diabetes"
gnomad_df$var_class2 <- "gnomAD"
combined_df <- rbind(patho_df1,patho_df2,patho_df3,gnomad_df)
median_df <- combined_df %>%
group_by(var_class2) %>%
summarise(
median_dist = median(min_dist_to_ligand),
n = n()
)
median_df
## # A tibble: 4 × 3
## var_class2 median_dist n
## <chr> <dbl> <int>
## 1 HH 16.6 3
## 2 MODY 16.6 57
## 3 Monogenic diabetes 15.6 164
## 4 gnomAD 21.4 251
combined_df$var_class2 <- factor(
combined_df$var_class2,
levels = c("gnomAD", "HH", "MODY", "Monogenic diabetes")
)
custom_colors <- c(
"gnomAD" = "#1b9e77", # Teal green
"HH" = "#f4a6b3", # Warm orange
"MODY" = "#e7298a", # Muted purple
"Monogenic diabetes" = "#7570b3" # Hot pink / magenta
)
combined_df <- combined_df %>%
mutate(mutation_position = as.numeric(str_extract(mutant, "(?<=\\D)(\\d+)(?=\\D)")))
combined_df$site_type <- "Non-orthosteric site"
combined_df$site_type[combined_df$mutation_position %in% active_sites] <- "Active site"
combined_df$site_type[combined_df$mutation_position %in% binding_sites] <- "Binding site"
table(combined_df$site_type)
##
## Active site Binding site Non-orthosteric site
## 48 26 401
nrow(combined_df)
## [1] 475
p1 <- ggplot(combined_df, aes(x = var_class2, y = min_dist_to_ligand, fill = var_class2)) +
geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
geom_jitter(aes(shape = site_type), width = 0.15, size = 2, alpha = 0.7,
fill = "lightgrey", color = "darkgrey", stroke = 0.5)+
stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
stat_summary(fun = median, geom = "point", shape = 23, size = 2, fill = "black", color = "black", stroke = 0.7) +
geom_text(
data = median_df,
aes(x = var_class2, y = median_dist + 1, label = sprintf("%.2f", median_dist)),
inherit.aes = FALSE,
size = 5
) +
scale_fill_manual(values = custom_colors, guide = "none") +
scale_shape_manual(values = c(
"Non-orthosteric site" = 21,
"Active site" = 22,
"Binding site" = 23
)) +
geom_text(
data = median_df,
aes(x = var_class2, y = 40, label = paste0("n = ", n)),
inherit.aes = FALSE,
size = 4
) +
labs(
title = "475 Variants",
x = "",
y = "Distance to glucose",
fill = "",
shape = "Site Type"
) +
theme_classic() +
theme(
legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1)
) +
geom_signif(
comparisons = list(c("gnomAD", "HH"),
c("gnomAD", "MODY"),
c("gnomAD", "Monogenic diabetes")),
map_signif_level = FALSE,
test = "wilcox.test",
step_increase = 0.1,
textsize = 3
)
p1

ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig5_violin5.pdf",
plot = p1, width = 3, height = 4, dpi = 300)
p1

ggplot(combined_df, aes(x = var_class2, y = min_dist_to_ligand, fill = var_class2)) +
geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
geom_jitter(aes(shape = site_type), width = 0.15, size = 2, alpha = 0.7,
fill = "lightgrey", color = "darkgrey", stroke = 0.5)+
stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
stat_summary(fun = median, geom = "point", shape = 23, size = 2, fill = "black", color = "black", stroke = 0.7) +
geom_text(
data = median_df,
aes(x = var_class2, y = median_dist + 1, label = sprintf("%.2f", median_dist)),
inherit.aes = FALSE,
size = 5
) +
scale_fill_manual(values = custom_colors, guide = "none") +
scale_shape_manual(values = c(
"Non-orthosteric site" = 21,
"Active site" = 22,
"Binding site" = 23
)) +
geom_text(
data = median_df,
aes(x = var_class2, y = 40, label = paste0("n = ", n)),
inherit.aes = FALSE,
size = 4
) +
labs(
title = "475 Variants",
x = "",
y = "Distance to glucose",
fill = "",
shape = "Site Type"
) +
theme_classic() +
theme(
legend.position = "bottom",
axis.text.x = element_text(angle = 45, hjust = 1)
) +
geom_signif(
comparisons = list(c("gnomAD", "HH"),
c("gnomAD", "MODY"),
c("gnomAD", "Monogenic diabetes")),
map_signif_level = FALSE,
test = "wilcox.test",
step_increase = 0.1,
textsize = 3
)

nrow(merged_df) #7969
## [1] 7969
merged_df_neg <- merged_df %>% filter(residuals <=0)
nrow(merged_df_neg) #4417
## [1] 4417
merged_df_residue <- merged_df_neg %>%
group_by(mutation_position) %>%
summarise(
loess_residual_avg = median(residuals, na.rm = TRUE))
merged_df_residue <- merge(merged_df_residue, protein_ca, by.x="mutation_position", by.y = "resno", all.x = TRUE)
merged_df_residue <- merged_df_residue[!is.na(merged_df_residue$min_dist_to_ligand), ]
nrow(merged_df_residue) #445
## [1] 445
# Fit exponential decay: y = a * exp(-b * x)
exp_model <- nls(abs(loess_residual_avg) ~ a * exp(-b * min_dist_to_ligand),
data = merged_df_residue,
start = list(a = 1, b = 0.1))
exp_model
## Nonlinear regression model
## model: abs(loess_residual_avg) ~ a * exp(-b * min_dist_to_ligand)
## data: merged_df_residue
## a b
## 0.9493 0.0438
## residual sum-of-squares: 19.54
##
## Number of iterations to convergence: 4
## Achieved convergence tolerance: 8.104e-07
# Nonlinear regression model
# model: abs(loess_residual_avg) ~ a * exp(-b * min_dist_to_ligand)
# data: merged_df_residue
# a b
# 0.9493 0.0438
# residual sum-of-squares: 19.54
#
# Number of iterations to convergence: 4
# Achieved convergence tolerance: 8.104e-07
a <- coef(exp_model)[["a"]]
b <- coef(exp_model)[["b"]]
merged_df_neg <- merged_df_neg %>%
mutate(expected_res = a * exp(-b * min_dist_to_ligand),
residual = abs(residuals) - expected_res)
# One-sided Wilcoxon signed-rank test for each residue
wilcox_test_results <- merged_df_neg %>%
group_by(mutation_position) %>%
summarise(
p_wilcox = tryCatch(wilcox.test(residual, mu = 0, alternative = "greater")$p.value, error = function(e) NA_real_),
.groups = "drop"
) %>%
mutate(p_adj = p.adjust(p_wilcox, method = "fdr"))
merged_df_residue <- left_join(merged_df_residue, wilcox_test_results, by = "mutation_position") %>%
mutate(
hotspot = !is.na(p_adj) & p_adj < 0.05,
hotspot_label = ifelse(hotspot, as.character(mutation_position), NA)
)
merged_df_residue %>% filter(hotspot == TRUE) %>% pull(mutation_position)
## [1] 76 82 85 86 118 119 122 123 125 131 134 140 141 142 143 146 148 152 156
## [20] 158 159 160 162 163 164 168 170 171 173 175 178 180 181 183 184 185 186 188
## [39] 189 191 192 195 196 197 199 202 203 228 246 295 445 448 449 450 451 453
half_d <- log(2) / b
half_d #15.82528
## [1] 15.82366
x_vals <- seq(min(merged_df_residue$min_dist_to_ligand),
max(merged_df_residue$min_dist_to_ligand), length.out = 200)
# --- Bootstrapping for confidence intervals ---
set.seed(11)
boot_params <- replicate(1000, {
samp <- merged_df_residue[sample(nrow(merged_df_residue), replace = TRUE), ]
fit <- try(nlsLM(abs(loess_residual_avg) ~ a * exp(-b * min_dist_to_ligand),
data = samp, start = list(a = 1, b = 0.1)), silent = TRUE)
if (inherits(fit, "try-error")) c(NA, NA) else coef(fit)
})
boot_params <- t(boot_params)[complete.cases(t(boot_params)), ]
boot_preds <- apply(boot_params, 1, function(p) p[1] * exp(-p[2] * x_vals))
fit_df_residue <- data.frame(
min_dist_to_ligand = x_vals,
loess_residual_avg = predict(exp_model, newdata = data.frame(min_dist_to_ligand = x_vals)),
lower = apply(boot_preds, 1, quantile, probs = 0.025),
upper = apply(boot_preds, 1, quantile, probs = 0.975)
)
merged_df_residue$site_type <- "Non-orthosteric site"
#merged_df_residue$site_type[abs(merged_df_residue$loess_residual_avg) <= y_cutoff] <- "Null"
merged_df_residue$site_type[merged_df_residue$mutation_position %in% active_sites] <- "ATP-binding site"
merged_df_residue$site_type[merged_df_residue$mutation_position %in% binding_sites] <- "Glucose-binding site"
max(merged_df_residue %>% filter (site_type == "Glucose-binding site") %>% pull(min_dist_to_ligand)) #8.666125
## [1] 8.666125
table(merged_df_residue$site_type)
##
## ATP-binding site Glucose-binding site Non-orthosteric site
## 45 22 378
nrow(merged_df_residue) #445
## [1] 445
# Plot with fitted curve
p23 <- ggplot(merged_df_residue, aes(x = min_dist_to_ligand, y = abs(loess_residual_avg))) +
# First, plot NULL points separately with alpha = 0.2
geom_point(aes(color = site_type), size = 1) +
geom_text_repel(data = merged_df_residue %>% filter(hotspot) %>% filter (site_type != "ATP-binding site"), aes(label = hotspot_label, color = site_type), size = 3.5, max.overlaps = 50) +
geom_ribbon(data = fit_df_residue,
aes(x = min_dist_to_ligand, ymin = lower, ymax = upper),
fill = "grey70", alpha = 0.3, inherit.aes = FALSE) +
geom_line(data = fit_df_residue, aes(x = min_dist_to_ligand, y = abs(loess_residual_avg)),
inherit.aes = FALSE, color = "black", size = 1) +
scale_color_manual(values = c(
"Null" = "grey",
"Non-orthosteric site" = "darkgreen",
"ATP-binding site" = "cyan",
"Glucose-binding site" = "orange"
)) +
theme_classic() +
geom_vline(xintercept = 8.666125, linetype = "dashed", color = "slategrey") +
labs(
title = "GCK: per-residue allosteric decay",
subtitle = "445 residues",
x = "Minimal distance to glucose",
y = "|Median activity-abundance residual|",
color = ""
) + theme(legend.position = "none")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
p23 <- ggMarginal(
p23,
type = "density",
margins = "both",
groupColour = FALSE,
groupFill = FALSE,
size = 10,
colour = "grey",
fill = "lightgrey"
)
p23
ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig5_decay1.pdf",
plot = p23, width = 5, height = 5, dpi = 300)
p23

lm_model <- lm(log(abs(loess_residual_avg)) ~ min_dist_to_ligand, data = merged_df_residue)
summary(lm_model)
##
## Call:
## lm(formula = log(abs(loess_residual_avg)) ~ min_dist_to_ligand,
## data = merged_df_residue)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4024 -0.3130 0.1089 0.4467 1.5179
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.173587 0.085822 -2.023 0.0437 *
## min_dist_to_ligand -0.045660 0.004112 -11.104 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6397 on 443 degrees of freedom
## Multiple R-squared: 0.2177, Adjusted R-squared: 0.2159
## F-statistic: 123.3 on 1 and 443 DF, p-value: < 2.2e-16
# Call:
# lm(formula = log(abs(loess_residual_avg)) ~ min_dist_to_ligand,
# data = merged_df_residue)
#
# Residuals:
# Min 1Q Median 3Q Max
# -3.4024 -0.3130 0.1089 0.4467 1.5179
#
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) -0.173587 0.085822 -2.023 0.0437 *
# min_dist_to_ligand -0.045660 0.004112 -11.104 <2e-16 ***
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
# Residual standard error: 0.6397 on 443 degrees of freedom
# Multiple R-squared: 0.2177, Adjusted R-squared: 0.2159
# F-statistic: 123.3 on 1 and 443 DF, p-value: < 2.2e-16
merged_df <- merge(gck_df, fil_protein_ca, by.x="mutation_position", by.y = "resno", all.x = TRUE)
#nrow(merged_df) #8255
merged_df <- merged_df %>% dplyr::select(-sequence)
merged_df <- merged_df[!is.na(merged_df$min_dist_to_ligand),]
merged_df_neg <- merged_df %>% filter(residuals <= 0)
nrow(merged_df_neg) #2003
## [1] 4417
# Fit exponential decay: y = a * exp(-b * x) + c
exp_model <- nls(abs(residuals) ~ a * exp(-b * min_dist_to_ligand),
data = merged_df_neg,
start = list(a = 1, b = 0.1))
exp_model
## Nonlinear regression model
## model: abs(residuals) ~ a * exp(-b * min_dist_to_ligand)
## data: merged_df_neg
## a b
## 0.90290 0.03361
## residual sum-of-squares: 431
##
## Number of iterations to convergence: 4
## Achieved convergence tolerance: 2.906e-07
# Nonlinear regression model
# model: abs(residuals) ~ a * exp(-b * min_dist_to_ligand)
# data: merged_df
# a b
# 0.90290 0.03361
# residual sum-of-squares: 431
#
# Number of iterations to convergence: 4
# Achieved convergence tolerance: 2.906e-07
a <- coef(exp_model)[["a"]]
b <- coef(exp_model)[["b"]]
merged_df_neg <- merged_df_neg %>%
mutate(
expected_res = a * exp(-b * min_dist_to_ligand),
observed_abs_res = abs(residuals),
interpolated_upper = approx(x = fit_df_residue$min_dist_to_ligand,
y = fit_df_residue$upper,
xout = min_dist_to_ligand,
rule = 2)$y,
sig_above_curve = observed_abs_res > interpolated_upper
)
# 4. Get mutations significantly above the curve
significant_mutations <- merged_df_neg %>%
filter(sig_above_curve)
nrow(merged_df_neg)
## [1] 4417
merged_df_neg$site_type <- "Non-orthosteric site"
merged_df_neg$site_type[merged_df_neg$mutation_position %in% active_sites] <- "ATP-binding site"
merged_df_neg$site_type[merged_df_neg$mutation_position %in% binding_sites] <- "Glucose-binding site"
merged_df_neg$pathogenic_status <- ifelse(
merged_df_neg$clinvar_clinical_significance %in% c("pathogenic", "likely_pathogenic"),
"Pathogenic", "Other"
)
table(merged_df_neg$site_type)
##
## ATP-binding site Glucose-binding site Non-orthosteric site
## 683 346 3388
half_d <- log(2) / b #20.62324
half_d
## [1] 20.62314
merged_df_neg$site_type <- "Non-orthosteric site"
merged_df_neg$site_type[merged_df_neg$mutation_position %in% active_sites] <- "ATP-binding site"
merged_df_neg$site_type[merged_df_neg$mutation_position %in% binding_sites] <- "Glucose-binding site"
x_vals <- seq(min(merged_df_neg$min_dist_to_ligand),
max(merged_df_neg$min_dist_to_ligand), length.out = 200)
# --- Bootstrapping for confidence intervals ---
set.seed(11)
boot_params <- replicate(1000, {
samp <- merged_df_neg[sample(nrow(merged_df_neg), replace = TRUE), ]
fit <- try(nlsLM(abs(residuals) ~ a * exp(-b * min_dist_to_ligand),
data = samp, start = list(a = 1, b = 0.1)), silent = TRUE)
if (inherits(fit, "try-error")) c(NA, NA) else coef(fit)
})
boot_params <- t(boot_params)[complete.cases(t(boot_params)), ]
boot_preds <- apply(boot_params, 1, function(p) p[1] * exp(-p[2] * x_vals))
fit_df_residue <- data.frame(
min_dist_to_ligand = x_vals,
residuals = predict(exp_model, newdata = data.frame(min_dist_to_ligand = x_vals)),
lower = apply(boot_preds, 1, quantile, probs = 0.025),
upper = apply(boot_preds, 1, quantile, probs = 0.975)
)
merged_df_neg$pathogenic_status <- ifelse(
merged_df_neg$clinvar_clinical_significance %in% c("pathogenic", "likely_pathogenic"),
"Pathogenic", "Other"
)
# Plot
p24 <- ggplot(merged_df_neg, aes(x = min_dist_to_ligand, y = abs(residuals))) +
# Base layer: all points
geom_point(aes(color = site_type, alpha = pathogenic_status, shape = pathogenic_status), size = 2) +
geom_ribbon(data = fit_df_residue,
aes(x = min_dist_to_ligand, ymin = lower, ymax = upper),
fill = "grey70", alpha = 0.3, inherit.aes = FALSE) +
geom_line(data = fit_df_residue, aes(x = min_dist_to_ligand, y = abs(residuals)),
inherit.aes = FALSE, color = "black", size = 1) +
geom_text_repel(
data = merged_df_neg %>% filter(sig_above_curve) %>% filter(pathogenic_status == "Pathogenic") %>% filter(site_type != "ATP-binding site"),
aes(label = mutant, color = site_type),
size = 3.5, max.overlaps = Inf, box.padding = 0.4, point.padding = 0.3
) +
# Manual color palette for site type
scale_color_manual(values = c(
#"Null" = "grey",
"Non-orthosteric site" = "darkgreen",
"ATP-binding site" = "cyan",
"Glucose-binding site" = "orange"
)) +
# Transparency scale
scale_alpha_manual(values = c("Other" = 0.1, "Pathogenic" = 1)) +
# Reference lines
geom_vline(xintercept = 8.666125, linetype = "dashed", color = "slategrey") +
theme_classic() +
labs(
title = "GCK: per-mutation allosteric decay",
x = "Minimal distance to glucose",
y = "|Activity-abundance residual|",
color = ""
) + theme(legend.position = "none")
p24 <- ggMarginal(
p24,
type = "density",
margins = "both",
groupColour = FALSE,
groupFill = FALSE,
size = 10,
colour = "grey",
fill = "lightgrey"
)
ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig5_decay2.pdf",
plot = p24, width = 5, height = 5, dpi = 300)
p24

# merged_df_neg$pathogenic_status <- ifelse(
# merged_df_neg$clinvar_clinical_significance %in% c("pathogenic","likely_pathogenic"),
# "Pathogenic", "Other"
# )
#
# merged_df_neg$curve_residual <- merged_df_neg$observed_abs_res - merged_df_neg$expected_res
#
# merged_df_neg_patho_nonortho <- merged_df_neg %>% filter(pathogenic_status == "Pathogenic") %>%
# filter(site_type == "Non-orthosteric site")
# nrow(merged_df_neg_patho_nonortho) #72
#
# merged_df_neg_non_patho_nonortho <- merged_df_neg %>% filter(pathogenic_status != "Pathogenic") %>%
# filter(site_type == "Non-orthosteric site")
# nrow(merged_df_neg_non_patho_nonortho) #3316
#
# wilcox.test(merged_df_neg_patho_nonortho$curve_residual, merged_df_neg_non_patho_nonortho$curve_residual)
# # 1. Combine data
# plot_df <- bind_rows(
# merged_df_neg_patho_nonortho %>%
# mutate(group = "Pathogenic"),
# merged_df_neg_non_patho_nonortho %>%
# mutate(group = "Non-pathogenic")
# )
#
# # 2. Summary stats for labeling
# label_df <- plot_df %>%
# group_by(group) %>%
# summarise(
# n_label = paste0("n = ", n()),
# median_val = median(curve_residual, na.rm = TRUE)
# )
# plot_df$group <- factor(plot_df$group, levels = c("Pathogenic", "Non-pathogenic"))
#
# # 3. Plot
# p_violin <- ggplot(plot_df, aes(x = group, y = curve_residual, fill = group)) +
# geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
# geom_boxplot(width = 0.5, outlier.shape = NA, alpha = 0.6, color = "lightgrey") +
# stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
# stat_summary(fun = median, geom = "point", shape = 23, size = 2.5,
# fill = "black", color = "black", stroke = 0.7) +
#
# # Add n labels
# geom_text(
# data = label_df,
# aes(x = group, y = max(plot_df$curve_residual, na.rm = TRUE) + 0.2, label = n_label),
# inherit.aes = FALSE,
# size = 4
# ) +
#
# # Add median labels
# geom_text(
# data = label_df,
# aes(x = group, y = median_val + 0.15, label = sprintf("%.2f", median_val)),
# inherit.aes = FALSE,
# size = 6
# ) +
#
# labs(
# x = "",
# y = "Residual to exponential curve",
# title = "Non-orthosteric Variants"
# ) +
# theme_classic(base_size = 14) +
# scale_fill_manual(values = c(
# "Pathogenic" = "indianred3",
# "Non-pathogenic" = "tan3"
# )) +
# theme(legend.position = "none") +
#
# # Wilcoxon significance
# geom_signif(comparisons = list(c("Pathogenic", "Non-pathogenic")),
# map_signif_level = FALSE,
# test = "wilcox.test",
# tip_length = 0.01)
#
# # Show plot
# p_violin
# merged_df_neg_patho_ortho <- merged_df_neg %>% filter(pathogenic_status == "Pathogenic") %>%
# filter(site_type != "Non-orthosteric site")
# nrow(merged_df_neg_patho_ortho) #25
# merged_df_neg_non_patho_ortho <- merged_df_neg %>% filter(pathogenic_status != "Pathogenic") %>%
# filter(site_type != "Non-orthosteric site")
# nrow(merged_df_neg_non_patho_ortho) #1004
# wilcox.test(merged_df_neg_patho_ortho$curve_residual, merged_df_neg_non_patho_ortho$curve_residual)
# # 1. Combine data
# plot_df2 <- bind_rows(
# merged_df_neg_patho_ortho %>%
# mutate(group = "Pathogenic"),
# merged_df_neg_non_patho_ortho %>%
# mutate(group = "Non-pathogenic")
# )
#
# # 2. Summary stats for labeling
# label_df2 <- plot_df2 %>%
# group_by(group) %>%
# summarise(
# n_label = paste0("n = ", n()),
# median_val = median(curve_residual, na.rm = TRUE)
# )
# plot_df2$group <- factor(plot_df2$group, levels = c("Pathogenic", "Non-pathogenic"))
#
# # 3. Plot
# p_violin2 <- ggplot(plot_df2, aes(x = group, y = curve_residual, fill = group)) +
# geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
# geom_boxplot(width = 0.5, outlier.shape = NA, alpha = 0.6, color = "lightgrey") +
# stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
# stat_summary(fun = median, geom = "point", shape = 23, size = 2.5,
# fill = "black", color = "black", stroke = 0.7) +
#
# # Add n labels
# geom_text(
# data = label_df2,
# aes(x = group, y = max(plot_df2$curve_residual, na.rm = TRUE) + 0.2, label = n_label),
# inherit.aes = FALSE,
# size = 4
# ) +
#
# # Add median labels
# geom_text(
# data = label_df2,
# aes(x = group, y = median_val + 0.5, label = sprintf("%.2f", median_val)),
# inherit.aes = FALSE,
# size = 6
# ) +
#
# labs(
# x = "",
# y = "Residual to exponential curve",
# title = "GCK:Orthosteric Variants"
# ) +
# theme_classic(base_size = 14) +
# scale_fill_manual(values = c(
# "Pathogenic" = "indianred3",
# "Non-pathogenic" = "tan3"
# )) +
# theme(legend.position = "none") +
#
# # Wilcoxon significance
# geom_signif(comparisons = list(c("Pathogenic", "Non-pathogenic")),
# map_signif_level = FALSE,
# test = "wilcox.test",
# tip_length = 0.01)
#
# # Show plot
# p_violin2
# plot <- plot_grid(p_violin2, p_violin, ncol = 1, nrow =2)
# ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig5_violin6.pdf",
# plot = plot, width = 3, height = 6, dpi = 300)
# plot
# merged_df_neg_patho_ortho <- merged_df_neg %>% filter(pathogenic_status == "Pathogenic") %>%
# filter(site_type != "Non-orthosteric site")
# nrow(merged_df_neg_patho_ortho) #25
# merged_df_neg_non_patho_ortho <- merged_df_neg %>% filter(pathogenic_status != "Pathogenic") %>%
# filter(site_type != "Non-orthosteric site")
# nrow(merged_df_neg_non_patho_ortho) #1004
# wilcox.test(merged_df_neg_patho_ortho$curve_residual, merged_df_neg_non_patho_ortho$curve_residual)
# table(merged_df_neg$pathogenic_status)
#
# merged_df_neg_patho <- merged_df_neg %>% filter(pathogenic_status == "Pathogenic")
# table(merged_df_neg_patho$sig_above_curve)
#
# merged_df_neg_patho_ortho <- merged_df_neg %>% filter(pathogenic_status == "Pathogenic") %>% filter(site_type != "Non-orthosteric site")
# nrow(merged_df_neg_patho_ortho )
# table(merged_df_neg_patho_ortho$sig_above_curve)
# #18/(18+7) = 0.72
#
# merged_df_neg_patho_nonortho <- merged_df_neg %>% filter(pathogenic_status == "Pathogenic") %>% filter(site_type == "Non-orthosteric site")
# table(merged_df_neg_patho_nonortho$sig_above_curve)
# #22/(50+22) = 0.3055556
# plot_df <- merged_df_neg %>%
# filter(pathogenic_status == "Pathogenic") %>%
# mutate(
# group = ifelse(site_type == "Non-orthosteric site", "Non-orthosteric", "Orthosteric"),
# group = factor(group, levels = c("Orthosteric", "Non-orthosteric"))
# ) %>%
# group_by(group, sig_above_curve) %>%
# summarise(count = n(), .groups = "drop") %>%
# group_by(group) %>%
# mutate(percentage = 100 * count / sum(count))
#
# # Plot
# p2 <- ggplot(plot_df, aes(x = group, y = percentage, fill = sig_above_curve)) +
# geom_col(width = 0.6) +
# geom_text(
# aes(label = sprintf("%d (%.0f%%)", count, percentage)),
# position = position_stack(vjust = 0.5),
# size = 4
# ) +
# scale_y_continuous(limits = c(0, 100), expand = c(0, 0)) +
# scale_fill_manual(
# values = c("TRUE" = "indianred3", "FALSE" = "grey80"),
# labels = c("Not above curve", "Above curve")
# ) +
# labs(
# x = "",
# y = "Variant percentage (%)",
# title = "GCK: 97 Pathogenic Variants",
# subtitle = "With negative activity-abundance residuals",
# fill = "") +
# theme_classic(base_size = 14) + theme(legend.position = "bottom")
# ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig5_boxplot1.pdf",
# plot = p2, width = 4, height = 5, dpi = 300)
# p2
# table(merged_df_neg$pathogenic_status)
#
# merged_df_neg_non_patho <- merged_df_neg %>% filter(pathogenic_status != "Pathogenic")
# table(merged_df_neg_patho$sig_above_curve)
# nrow(merged_df_neg_non_patho)
# merged_df_neg_non_patho_ortho <- merged_df_neg %>% filter(pathogenic_status != "Pathogenic") %>% filter(site_type != "Non-orthosteric site")
# table(merged_df_neg_non_patho_ortho$sig_above_curve)
#
# merged_df_neg_non_patho_nonortho <- merged_df_neg %>% filter(pathogenic_status != "Pathogenic") %>% filter(site_type == "Non-orthosteric site")
# table(merged_df_neg_non_patho_nonortho$sig_above_curve)
# plot_df <- merged_df_neg %>%
# filter(pathogenic_status != "Pathogenic") %>%
# mutate(
# group = ifelse(site_type == "Non-orthosteric site", "Non-orthosteric", "Orthosteric"),
# group = factor(group, levels = c("Orthosteric", "Non-orthosteric"))
# ) %>%
# group_by(group, sig_above_curve) %>%
# summarise(count = n(), .groups = "drop") %>%
# group_by(group) %>%
# mutate(percentage = 100 * count / sum(count))
#
# # Plot
# p3 <- ggplot(plot_df, aes(x = group, y = percentage, fill = sig_above_curve)) +
# geom_col(width = 0.6) +
# geom_text(
# aes(label = sprintf("%d (%.0f%%)", count, percentage)),
# position = position_stack(vjust = 0.5),
# size = 4
# ) +
# scale_y_continuous(limits = c(0, 100), expand = c(0, 0)) +
# scale_fill_manual(
# values = c("TRUE" = "indianred3", "FALSE" = "grey80"),
# labels = c("Not above curve", "Above curve")
# ) +
# labs(
# x = "",
# y = "Variant percentage (%)",
# title = "GCK: 4320 Non-pathogenic Variants",
# subtitle = "With negative activity-abundance residuals",
# fill = "") +
# theme_classic(base_size = 14) + theme(legend.position = "bottom")
# ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig5_boxplot2.pdf",
# plot = p3, width = 4, height = 5, dpi = 300)
# p3
lm_model <- lm(log(abs(residuals)) ~ min_dist_to_ligand, data = merged_df)
summary(lm_model)
##
## Call:
## lm(formula = log(abs(residuals)) ~ min_dist_to_ligand, data = merged_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.7944 -0.4913 0.2377 0.7202 2.7754
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.588428 0.035433 -16.61 <2e-16 ***
## min_dist_to_ligand -0.021230 0.001693 -12.54 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.114 on 7967 degrees of freedom
## Multiple R-squared: 0.01935, Adjusted R-squared: 0.01923
## F-statistic: 157.2 on 1 and 7967 DF, p-value: < 2.2e-16
# Call:
# lm(formula = log(abs(residuals)) ~ min_dist_to_ligand, data = merged_df)
#
# Residuals:
# Min 1Q Median 3Q Max
# -8.7790 -0.4103 0.2504 0.6455 1.8923
#
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) -0.233775 0.039517 -5.916 3.55e-09 ***
# min_dist_to_ligand -0.044398 0.002099 -21.154 < 2e-16 ***
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
# Residual standard error: 1.001 on 4415 degrees of freedom
# Multiple R-squared: 0.09203, Adjusted R-squared: 0.09183
# F-statistic: 447.5 on 1 and 4415 DF, p-value: < 2.2e-16
nrow(merged_df) #4417
## [1] 7969
#merged_df <- merge(gck_df, fil_protein_ca, by.x="mutation_position", by.y = "resno", all.x = TRUE)
#nrow(merged_df) #8255
#merged_df <- merged_df %>% dplyr::select(-sequence)
merged_df <- gck_df
nrow(merged_df) #8255
## [1] 8255
#merged_df <- merged_df[!is.na(merged_df$min_dist_to_ligand),]
#nrow(merged_df) #7969
merged_df$pathogenic_status <- ifelse(
merged_df$clean_condition %in% c(
"HH", "MODY",
"Monogenic diabetes"),
"Pathogenic", "Other"
)
merged_df_int <- merged_df %>% filter(mutation_position %in% active_positions)
nrow(merged_df_int) #1031
## [1] 1031
table(merged_df_int$pathogenic_status)
##
## Other Pathogenic
## 977 54
# Other Pathogenic
# 977 54
merged_df_out <- merged_df %>% filter(!mutation_position %in% active_positions)
nrow(merged_df_out) #7224
## [1] 7224
table(merged_df_out$pathogenic_status)
##
## Other Pathogenic
## 7054 170
# Other Pathogenic
# 7054 170
table(merged_df$clean_condition)
##
## HH Mixed MODY Monogenic diabetes
## 3 14 57 164
## Not provided Other
## 36 3
# 1. Set up
set.seed(123)
n_boot <- 1000
match_window <- 0.5
n_patho <- nrow(merged_df_int %>% filter(pathogenic_status == "Pathogenic"))
# 2. Get abundance/residuals from pten_patho
patho_df <- merged_df_int %>% filter(pathogenic_status == "Pathogenic") %>% dplyr::select(DMS_score_abundance, residuals)
patho_df$group <- "Pathogenic"
# 3. Bootstrap sampling from non-patho pool with abundance matching
non_patho_pool <- merged_df_int %>% filter(pathogenic_status != "Pathogenic") %>%
filter(!(mutant %in% patho_df$mutant))
bootstrap_medians <- vector("numeric", length = n_boot)
# Pre-group non-patho pool into bins
non_patho_pool <- non_patho_pool %>%
mutate(bin = cut(DMS_score_abundance, breaks = seq(-0.5, 2, by = match_window)))
# Bin pathogenic variants accordingly
patho_df <- patho_df %>%
mutate(bin = cut(DMS_score_abundance, breaks = seq(-0.5, 2, by = match_window)))
# Create lookup table for fast sampling
bin_lookup <- split(non_patho_pool$residuals, non_patho_pool$bin)
# Bootstrap matrix
bootstrap_matrix <- matrix(NA, nrow = n_boot, ncol = n_patho)
for (i in 1:n_boot) {
for (j in 1:n_patho) {
bin_j <- patho_df$bin[j]
candidates <- bin_lookup[[as.character(bin_j)]]
if (!is.null(candidates) && length(candidates) > 0) {
bootstrap_matrix[i, j] <- sample(candidates, 1)
}
}
}
# Summarize into a dataframe
boot_df <- data.frame(
group = "Random abundance-matched",
residuals = apply(bootstrap_matrix, 1, median, na.rm = TRUE) # Mean across matches per bootstrap
)
# Combine with patho residuals
plot_df <- bind_rows(
patho_df %>% dplyr::select(group, residuals),
boot_df
)
label_df <- plot_df %>%
group_by(group) %>%
summarise(
n = n(),
median_val = median(residuals),
y_max = max(residuals),
.groups = "drop"
)
label_df <- label_df %>%
mutate(n_label = case_when(
group == "Random abundance-matched" ~ "bootstrapped 1000 times",
TRUE ~ paste0("n = ", n)
))
# Plot
p_fast <- ggplot(plot_df, aes(x = group, y = residuals, fill = group)) +
geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
geom_jitter(width = 0.15, size = 2, alpha = 0.7, color = "lightgrey") +
stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
stat_summary(fun = median, geom = "point", shape = 23, size = 2.5,
fill = "black", color = "black", stroke = 0.7) +
geom_text(
data = label_df,
aes(x = group, y = 2.5, label = n_label),
inherit.aes = FALSE,
size = 4) +
geom_text(
data = label_df,
aes(x = group, y = median_val + 0.25, label = sprintf(" %.2f", median_val)),
inherit.aes = FALSE,
size = 6
) +
labs(
x = "",
y = "Activity-abundance residuals",
title = "Orthosteric variants"
) +
theme_classic(base_size = 14) +
scale_fill_manual(values = c("Pathogenic" = "cornflowerblue", "Random abundance-matched" = "darkolivegreen3")) +
theme(legend.position = "none") +
geom_signif(comparisons = list(c("Pathogenic", "Random abundance-matched")),
map_signif_level = FALSE,
test = "wilcox.test",
tip_length = 0.01)
p_fast

# 1. Set up
set.seed(11)
n_boot <- 1000
match_window <- 0.5
n_patho <- nrow(merged_df_out %>% filter(pathogenic_status == "Pathogenic"))
# 2. Get abundance/residuals from pten_patho
patho_df <- merged_df_out %>% filter(pathogenic_status == "Pathogenic") %>% dplyr::select(DMS_score_abundance, residuals)
patho_df$group <- "Pathogenic"
# 3. Bootstrap sampling from non-patho pool with abundance matching
non_patho_pool <- merged_df_out %>% filter(pathogenic_status != "Pathogenic")
nrow(non_patho_pool)
## [1] 7054
bootstrap_medians <- vector("numeric", length = n_boot)
# Pre-group non-patho pool into bins
non_patho_pool <- non_patho_pool %>%
mutate(bin = cut(DMS_score_abundance, breaks = seq(-0.5, 2, by = match_window)))
# Bin pathogenic variants accordingly
patho_df <- patho_df %>%
mutate(bin = cut(DMS_score_abundance, breaks = seq(-0.5, 2, by = match_window)))
# Create lookup table for fast sampling
bin_lookup <- split(non_patho_pool$residuals, non_patho_pool$bin)
# Bootstrap matrix
bootstrap_matrix <- matrix(NA, nrow = n_boot, ncol = n_patho)
for (i in 1:n_boot) {
for (j in 1:n_patho) {
bin_j <- patho_df$bin[j]
candidates <- bin_lookup[[as.character(bin_j)]]
if (!is.null(candidates) && length(candidates) > 0) {
bootstrap_matrix[i, j] <- sample(candidates, 1)
}
}
}
# Summarize into a dataframe
boot_df <- data.frame(
group = "Random abundance-matched",
residuals = apply(bootstrap_matrix, 1, median, na.rm = TRUE) # Mean across matches per bootstrap
)
# Combine with patho residuals
plot_df <- bind_rows(
patho_df %>% dplyr::select(group, residuals),
boot_df
)
label_df <- plot_df %>%
group_by(group) %>%
summarise(
n = n(),
median_val = median(residuals),
y_max = max(residuals),
.groups = "drop"
)
label_df <- label_df %>%
mutate(n_label = case_when(
group == "Random abundance-matched" ~ "bootstrapped 1000 times",
TRUE ~ paste0("n = ", n)
))
# Plot
p_fast2 <- ggplot(plot_df, aes(x = group, y = residuals, fill = group)) +
geom_violin(trim = FALSE, scale = "width", alpha = 0.8, color = NA) +
geom_jitter(width = 0.15, size = 2, alpha = 0.7, color = "lightgrey") +
stat_summary(fun = median, geom = "crossbar", width = 0.4, color = "black", fatten = 1) +
stat_summary(fun = median, geom = "point", shape = 23, size = 2.5,
fill = "black", color = "black", stroke = 0.7) +
geom_text(
data = label_df,
aes(x = group, y = 1.5, label = n_label),
inherit.aes = FALSE,
size = 4) +
geom_text(
data = label_df,
aes(x = group, y = median_val + 0.25, label = sprintf(" %.2f", median_val)),
inherit.aes = FALSE,
size = 6
) +
labs(
x = "",
y = "Activity-abundance residual",
title = "Non-orthosteric variants"
) +
theme_classic(base_size = 14) +
scale_fill_manual(values = c("Pathogenic" = "cornflowerblue", "Random abundance-matched" = "darkolivegreen3")) +
theme(legend.position = "none") +
geom_signif(comparisons = list(c("Pathogenic", "Random abundance-matched")),
map_signif_level = FALSE,
test = "wilcox.test",
tip_length = 0.01)
p_fast2

fig5G<- plot_grid(p_fast, p_fast2, ncol=1, nrow=2)
fig5G

ggsave("/Users/xl7/Documents/0.Projects/01.protein-seq-evo-v1/figs/panels/fig5_violin6.pdf",
plot = fig5G, width = 3, height = 6, dpi = 300)